library(Stat2Data)
library(dplyr)
library(GGally)
library(gridExtra)
data("NFLStandings2016")
## Isolate a few key stats for all teams.
nfl_minimal <- NFLStandings2016 %>%
  select(WinPct, PointsFor, PointsAgainst)
head(nfl_minimal)
## Pairwise scatterplots and correlations of the selected columns.
ggpairs(nfl_minimal)
## Compare the two simple regressions via R^2: points allowed explains
## more of the variation in winning percentage than points scored.
winpct_pointsfor <- summary(lm(WinPct ~ PointsFor, data = nfl_minimal))$r.squared
winpct_pointsagainst <- summary(lm(WinPct ~ PointsAgainst, data = nfl_minimal))$r.squared
winpct_pointsfor
## [1] 0.3323022
winpct_pointsagainst
## [1] 0.46863
## Scatterplot of WinPct vs. PointsFor with its least-squares line,
## annotated with the model's R^2 (parsed as plotmath).
pointsfor_plot <- ggplot(data = NFLStandings2016, aes(x = PointsFor, y = WinPct)) +
  geom_point() +
  geom_smooth(method = lm, se = FALSE, formula = y ~ x) +
  annotate(geom = "label", x = 475, y = 0.1,
           label = paste0("R^2==", round(winpct_pointsfor, 2)),
           parse = TRUE, size = 6)
## Same plot for PointsAgainst.
pointsagainst_plot <- ggplot(data = NFLStandings2016, aes(x = PointsAgainst, y = WinPct)) +
  geom_point() +
  geom_smooth(method = lm, se = FALSE, formula = y ~ x) +
  annotate(geom = "label", x = 425, y = 0.85,
           label = paste0("R^2==", round(winpct_pointsagainst, 2)),
           parse = TRUE, size = 6)
## Display the two plots side by side.
grid.arrange(pointsfor_plot, pointsagainst_plot, nrow = 1)
library(devtools)
library(regplanely)
library(equatiomatic)
## Multiple regression: predict WinPct from both scoring stats at once.
winpct_multiple <- lm(WinPct ~ PointsFor + PointsAgainst, data = nfl_minimal)
summary(winpct_multiple)
##
## Call:
## lm(formula = WinPct ~ PointsFor + PointsAgainst, data = nfl_minimal)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.149898 -0.073482 -0.006821 0.072569 0.213189
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.7853698 0.1537422 5.108 1.88e-05 ***
## PointsFor 0.0016992 0.0002628 6.466 4.48e-07 ***
## PointsAgainst -0.0024816 0.0003204 -7.744 1.54e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.09653 on 29 degrees of freedom
## Multiple R-squared: 0.7824, Adjusted R-squared: 0.7674
## F-statistic: 52.13 on 2 and 29 DF, p-value: 2.495e-10
## Render the fitted equation (LaTeX) using the estimated coefficients.
extract_eq(winpct_multiple, use_coefs = TRUE, coef_digits = 4)
## \[ \operatorname{\widehat{WinPct}} = 0.7854 + 0.0017(\operatorname{PointsFor}) - 0.0025(\operatorname{PointsAgainst}) \]
## 3-D visualization of the fitted regression plane.
regression_plane(winpct_multiple)
library(performance)
library(broom)
## Residual diagnostics for the standard regression conditions.
check_model(winpct_multiple, check = c("linearity", "homogeneity",
                                       "qq", "normality"))
summary(winpct_multiple)$coefficients
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.785369843 0.1537421680 5.108357 1.876903e-05
## PointsFor 0.001699153 0.0002627934 6.465739 4.476431e-07
## PointsAgainst -0.002481576 0.0003204459 -7.744136 1.537195e-08
## Tidy coefficient table, including confidence intervals.
tidy(winpct_multiple, conf.int = TRUE)
## Note: unlike the textbook's combined ANOVA table, R's anova() reports
## sequential (Type I) statistics -- one row per explanatory variable,
## each adjusted only for the variables listed before it in the model.
anova(winpct_multiple)
## We can reproduce the textbook's combined regression row by summing
## the predictor rows of the sequential ANOVA table:
## Collapse the two predictor rows of the sequential ANOVA table into a
## single "regression" row: total df, total sum of squares, mean square.
winpct_multiple %>%
  anova() %>%
  as_tibble() %>%
  slice_head(n = 2) %>% ## the two explanatory-variable rows
  summarise(df = sum(Df),
            `sum sq` = sum(`Sum Sq`)) %>%
  mutate(`mean sq` = `sum sq` / df)
## Compare with the second line from the bottom of the textbook's table.
## Full model summary; the "Residual standard error" line gives the
## regression standard error and the F-statistic line matches the
## collapsed ANOVA regression row.
winpct_multiple %>% summary()
##
## Call:
## lm(formula = WinPct ~ PointsFor + PointsAgainst, data = nfl_minimal)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.149898 -0.073482 -0.006821 0.072569 0.213189
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.7853698 0.1537422 5.108 1.88e-05 ***
## PointsFor 0.0016992 0.0002628 6.466 4.48e-07 ***
## PointsAgainst -0.0024816 0.0003204 -7.744 1.54e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.09653 on 29 degrees of freedom
## Multiple R-squared: 0.7824, Adjusted R-squared: 0.7674
## F-statistic: 52.13 on 2 and 29 DF, p-value: 2.495e-10
## Fitted values with 95% confidence intervals for the mean response at
## each observed team (no newdata, so the original 32 rows are used).
winpct_multiple %>% predict(interval = "confidence")
## fit lwr upr
## 1 0.9143024 0.82272694 1.0058778
## 2 0.7413510 0.68174866 0.8009534
## 3 0.6745702 0.62362296 0.7255175
## 4 0.5368107 0.49004268 0.5835788
## 5 0.6953926 0.59054904 0.8002362
## 6 0.6073397 0.53716546 0.6775139
## 7 0.6518565 0.60556587 0.6981472
## 8 0.6622498 0.60297051 0.7215291
## 9 0.5565525 0.50359855 0.6095063
## 10 0.4591635 0.42279419 0.4955328
## 11 0.6141597 0.55507867 0.6732408
## 12 0.4848726 0.44832699 0.5214181
## 13 0.4454766 0.38264895 0.5083042
## 14 0.4711684 0.43570558 0.5066313
## 15 0.4947114 0.45755194 0.5318709
## 16 0.5077908 0.46698762 0.5485940
## 17 0.5715934 0.52495111 0.6182357
## 18 0.5109439 0.46410266 0.5577852
## 19 0.5791490 0.52370313 0.6345949
## 20 0.5972853 0.55202918 0.6425414
## 21 0.5252962 0.48466189 0.5659304
## 22 0.4556371 0.36627654 0.5449976
## 23 0.5875573 0.54635178 0.6287629
## 24 0.5558981 0.50347493 0.6083213
## 25 0.4147637 0.37193665 0.4575908
## 26 0.2376723 0.17199363 0.3033509
## 27 0.4323159 0.37453506 0.4900968
## 28 0.1882391 0.10327672 0.2732015
## 29 0.2692847 0.20772914 0.3308402
## 30 0.3330701 0.28452810 0.3816120
## 31 0.1192516 0.03129992 0.2072032
## 32 0.1122738 0.02697032 0.1975773
## Build a data frame holding all possible combinations of the chosen
## predictor values:
## A data frame holding all 9 combinations of the chosen predictor values.
points_grid <- expand.grid(PointsFor = c(200, 300, 400),
                           PointsAgainst = c(200, 300, 400))
## 99% prediction intervals at each grid point.
## BUG FIX: predict.lm's argument is `newdata`, not `new_data`. The
## original misspelling was silently absorbed by `...`, so predict()
## ignored the grid and returned intervals for the 32 observed teams --
## which is exactly what the stale output below shows. With `newdata`
## this call returns 9 rows, one per grid combination.
predict(winpct_multiple, newdata = points_grid,
        interval = "prediction", level = 0.99)
## (Stale output from the buggy call -- 32 rows, not 9:)
## fit lwr upr
## 1 0.9143024 0.620993676 1.2076110
## 2 0.7413510 0.463411246 1.0192908
## 3 0.6745702 0.399774639 0.9493658
## 4 0.5368107 0.263368092 0.8102534
## 5 0.6953926 0.394122702 0.9966626
## 6 0.6073397 0.324952550 0.8897268
## 7 0.6518565 0.378561492 0.9251516
## 8 0.6622498 0.384435560 0.9400641
## 9 0.5565525 0.281068680 0.8320362
## 10 0.4591635 0.188607375 0.7297196
## 11 0.6141597 0.336422182 0.8918973
## 12 0.4848726 0.214273309 0.7554718
## 13 0.4454766 0.166249549 0.7247036
## 14 0.4711684 0.200830978 0.7415059
## 15 0.4947114 0.223960335 0.7654625
## 16 0.5077908 0.236088563 0.7794931
## 17 0.5715934 0.298189797 0.8449970
## 18 0.5109439 0.237478527 0.7844093
## 19 0.5791490 0.302776171 0.8555219
## 20 0.5972853 0.324305143 0.8702654
## 21 0.5252962 0.253639891 0.7969524
## 22 0.4556371 0.163571875 0.7477023
## 23 0.5875573 0.315744815 0.8593699
## 24 0.5558981 0.280598772 0.8311975
## 25 0.4147637 0.142496317 0.6870311
## 26 0.2376723 -0.042743822 0.5180884
## 27 0.4323159 0.155075654 0.7095562
## 28 0.1882391 -0.101432263 0.4779105
## 29 0.2692847 -0.009427268 0.5479966
## 30 0.3330701 0.059066474 0.6070736
## 31 0.1192516 -0.172035813 0.4105390
## 32 0.1122738 -0.177579586 0.4021272
library(moderndive)
library(equatiomatic)
library(performance)
## Load the butterfly data and preview the first rows.
data("ButterfliesBc")
ButterfliesBc %>% head()
## Generate a column in the data that labels males and females as "m" or "f":
## Add a short plotting label: "m" for Male, "f" for Female.
ButterfliesBc <- ButterfliesBc %>%
  mutate(label = recode(Sex, Male = "m", Female = "f"))
head(ButterfliesBc)
## Visualize the new data:
## Plot Wing vs. Temp, marking each butterfly with its sex label.
ggplot(ButterfliesBc, aes(x = Temp, y = Wing, color = Sex)) +
  geom_text(aes(label = label)) +
  guides(label = "none", color = "none") +
  theme_bw()
## Because geom_smooth() fits each group with its own simple linear
## model when visualizing, we use geom_parallel_slopes() to draw the
## two parallel fitted lines instead:
## Overlay the parallel-slopes fits on the labeled scatterplot.
ggplot(ButterfliesBc, aes(x = Temp, y = Wing, color = Sex)) +
  geom_text(aes(label = label)) +
  guides(label = "none", color = "none") +
  geom_parallel_slopes(se = FALSE) +
  theme_bw()
## To make a more traditional version of this plot:
## Traditional version of the same plot: points instead of text labels.
ggplot(ButterfliesBc, aes(x = Temp, y = Wing, color = Sex)) +
  geom_point() +
  geom_parallel_slopes(se = FALSE) +
  theme_bw()
## In this model the indicator variable the textbook calls IMale is
## labeled SexMale instead, following R's
## {VariableName}{GroupName} pattern for naming indicator variables.
## Parallel-slopes model: Wing depends on Temp plus an additive Sex offset.
wing_sex_model <- lm(Wing ~ Temp + Sex, data = ButterfliesBc)
summary(wing_sex_model)$coefficients
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 19.3354676 0.13371975 144.596949 5.232477e-43
## Temp -0.2350416 0.05391279 -4.359664 1.495383e-04
## SexMale -0.9312500 0.08356482 -11.144043 5.349289e-12
## Interpreting this model:
## - (Intercept): the predicted Wing for the baseline group -- Female,
##   i.e. SexMale = 0 -- when Temp = 0.
## - Temp: the average change in Wing for a one-unit increase in Temp,
##   regardless of Sex (the slopes are parallel).
## - SexMale: the vertical distance from the Female (SexMale = 0) line
##   to the Male (SexMale = 1) line.
extract_eq(wing_sex_model, use_coefs = T)
## \[ \operatorname{\widehat{Wing}} = 19.34 - 0.24(\operatorname{Temp}) - 0.93(\operatorname{Sex}_{\operatorname{Male}}) \]
## To more easily isolate the indicator variable:
## \[\operatorname{\widehat{Wing}} = 19.34 - 0.24(\operatorname{Temp}) - 0.93 \cdot 1_{\operatorname{Male}}(\operatorname{Sex})\]
## This model's assumptions are only mildly violated, mostly in the
## bottom-left quadrant at the more extreme values.
check_model(wing_sex_model, check = c("qq", "normality", "linearity", "homogeneity"))
## Load the data:
## Load the kids data and preview the first rows.
data("Kids198")
Kids198 %>% head()
## Before visualizing, replace the numeric indicator values for Sex with
## labels R will treat as categorical/discrete rather than continuous:
## Convert the numeric Sex indicator to a labeled factor so R treats it
## as categorical. NOTE(review): factor() assigns labels in sorted-level
## order, so this assumes the lower code is Male -- confirm against the
## Stat2Data Kids198 codebook.
Kids198 <- Kids198 %>%
  mutate(Sex = factor(Sex, labels = c("Male", "Female")))
head(Kids198)
## Visualize:
## One fitted line per sex (slopes allowed to differ).
ggplot(Kids198, aes(x = Age, y = Weight, color = Sex)) +
  geom_point() +
  geom_smooth(method = lm, formula = y ~ x, se = FALSE) +
  theme_bw()
## We can also fit this model with lm(), but the formula must include
## the interaction term (Age * Sex):
## Interaction model: Age * Sex expands to Age + Sex + Age:Sex, giving
## each sex its own intercept and slope.
weight_age_model <- lm(Weight ~ Age * Sex, data = Kids198)
summary(weight_age_model)$coefficients
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -33.6925380 10.00726703 -3.366807 9.168565e-04
## Age 0.9087098 0.06106202 14.881752 6.470598e-34
## SexFemale 31.8505655 13.24268959 2.405143 1.710556e-02
## Age:SexFemale -0.2812220 0.08163738 -3.444770 7.004052e-04
## Interpreting this model:
## - (Intercept): the y-axis intercept for the baseline group (Male);
##   we can tell Male is the baseline because it is not named in the
##   coefficient table.
## - Age: the slope for the baseline (Male) group, since it refers to
##   no specific group.
## - SexFemale: the difference in intercepts between the two groups'
##   lines (the Female intercept is 31.9 above the Male intercept).
## - Age:SexFemale: the difference between the two groups' slopes --
##   the distance from the Male slope to the Female slope.
## This means the Female group's intercept and slope must be inferred
## from the Male group's estimates.
extract_eq(weight_age_model, use_coefs = F)
## \[ \operatorname{Weight} = \alpha + \beta_{1}(\operatorname{Age}) + \beta_{2}(\operatorname{Sex}_{\operatorname{Female}}) + \beta_{3}(\operatorname{Age} \times \operatorname{Sex}_{\operatorname{Female}}) + \epsilon \]

## Section 3.4: New Predictors from Old (03-02-24)
library(Stat2Data)
library(regplanely)
## Load the perch data and preview the first rows.
data("Perch")
Perch %>% head()
## Interaction model: does the effect of Length on Weight depend on Width?
perch_model <- lm(Weight ~ Length * Width, data = Perch)
tidy(perch_model)
## Interpreting the terms:
## - (Intercept): the z-axis intercept -- the predicted Weight when
##   Length and Width are both 0.
## - Length: the change in Weight for each additional centimeter of
##   length (when the fish has width 0).
## - Width: the change in Weight for each additional centimeter of
##   width (when the fish has length 0).
## - Length:Width: the change in the slope of Weight vs. Width when
##   Length increases by 1, or equivalently the change in the slope of
##   Weight vs. Length when Width increases by 1.
regression_plane(perch_model)